Data

The data comes from a project known as Tidy Tuesday. We are looking at a data set from a few weeks ago, you can read more about it here.

office_ratings <- 
  readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv')
## Parsed with column specification:
## cols(
##   season = col_double(),
##   episode = col_double(),
##   title = col_character(),
##   imdb_rating = col_double(),
##   total_votes = col_double(),
##   air_date = col_date(format = "")
## )
# office_ratings <- 
#   read.csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv', stringsAsFactors = F)

Note that readr::read_csv, uses the read_csv from the readr library, but without loading in the library.

Base R

office_ratings_baseR <-
  read.csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv', stringsAsFactors = F)

office_ratings_baseR$air_date <- as.Date(office_ratings_baseR$air_date[1:4])
office_ratings
## # A tibble: 188 x 6
##    season episode title             imdb_rating total_votes air_date  
##     <dbl>   <dbl> <chr>                   <dbl>       <dbl> <date>    
##  1      1       1 Pilot                     7.6        3706 2005-03-24
##  2      1       2 Diversity Day             8.3        3566 2005-03-29
##  3      1       3 Health Care               7.9        2983 2005-04-05
##  4      1       4 The Alliance              8.1        2886 2005-04-12
##  5      1       5 Basketball                8.4        3179 2005-04-19
##  6      1       6 Hot Girl                  7.8        2852 2005-04-26
##  7      2       1 The Dundies               8.7        3213 2005-09-20
##  8      2       2 Sexual Harassment         8.2        2736 2005-09-27
##  9      2       3 Office Olympics           8.4        2742 2005-10-04
## 10      2       4 The Fire                  8.4        2713 2005-10-11
## # … with 178 more rows

IMDB Rating

Will begin by looking at ratings, and best way to explore is by plotting the data.

library(ggplot2)
office_ratings %>%
  ggplot(aes(imdb_rating)) +
  geom_histogram()  # or stat_bin
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Split by season
office_ratings %>%
  ggplot(aes(imdb_rating)) +
  geom_histogram() +
  facet_wrap(~season)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Tidy up/make nice
office_ratings %>%
  ggplot(aes(imdb_rating)) +
  geom_histogram(bins = 10, colour = "cornflowerblue") +
  facet_wrap(~season) +
  labs(title = "IMDB Rating by Season", 
         x = "Rating", y = "Count")

# Try different bin size
office_ratings %>%
  ggplot(aes(imdb_rating)) +
  geom_histogram(bins = 30, fill = "cornflowerblue") +
  facet_wrap(~season) +
  labs(title = "IMDB Rating by Season", 
         x = "Rating", y = "Count")

# Improve Labels
my_prefix <- function(string, prefix = "Season ") paste0(prefix, string)
office_ratings %>%
  ggplot(aes(imdb_rating)) +
  geom_histogram(bins = 30, fill = "cornflowerblue") +
  facet_wrap(~season, labeller = as_labeller(my_prefix)) +
  labs(title = "IMDB Rating by Season", 
         x = "Rating", y = "Count") +
  theme_light()

Next will look at boxplots

# Simple boxplot
office_ratings %>%
  ggplot(aes(x = season, y = imdb_rating, group = season)) +
  geom_boxplot()

# Fix labels on x axis
office_ratings %>%
  ggplot(aes(x = season, y = imdb_rating, group = season)) +
  geom_boxplot() +
  labs(title = "Boxplots of IMDB Ratings per Season", 
       y = "IMDB Rating", x = "Season") +
  theme_light() +
  scale_x_continuous(breaks = 1:max(office_ratings$season))

# Try to add colour
office_ratings %>%
  ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
  geom_boxplot() +
  labs(title = "Boxplots of IMDB Ratings per Season", 
       y = "IMDB Rating", x = "Season") +
  theme_light() +
  scale_x_continuous(breaks = 1:max(office_ratings$season))

# Need to change season to factor
# office_ratings$season <- as.factor(office_ratings$season)
# office_ratings$season <- office_ratings %>% mutate(season = as.factor(season))
office_ratings %>% 
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
  geom_boxplot() +
  labs(title = "Boxplots of IMDB Ratings per Season", 
       y = "IMDB Rating", x = "Season") +
  theme_light()

office_ratings %>% 
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
  geom_boxplot() +
  labs(title = "Boxplots of IMDB Ratings per Season", 
       y = "IMDB Rating", x = "Season") +
  theme_light() + theme(legend.position = "none")

Seasons 3 and 4 look good, peaks again at 7. Last season bad apart from 3 outliers.

After Season 7, Micheal (lead character - Steve Carell) leaves the show.

# ?geom_vline

office_ratings %>% 
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
  geom_boxplot() +
  labs(title = "Boxplots of IMDB Ratings per Season", 
       y = "IMDB Rating", x = "Season") +
  theme_light() + theme(legend.position = "none") +
  geom_vline(xintercept=7.5, linetype =  "dashed")

office_ratings %>% 
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
  geom_boxplot() +
  labs(title = "Boxplots of IMDB Ratings per Season", 
       y = "IMDB Rating", x = "Season") +
  theme_light() + theme(legend.position = "none") +
  geom_vline(xintercept = 7.5, linetype =  "dashed") + 
  annotate("text", x = 8, y = 5, label = "Micheal Scott Leaves")

Summary Stats

office_ratings %>%
  group_by(season) %>%
  summarise(mean_rating = mean(imdb_rating),
            highest_rating = max(imdb_rating), 
            lowest_rating = min(imdb_rating),
            variance = var(imdb_rating))
## # A tibble: 9 x 5
##   season mean_rating highest_rating lowest_rating variance
##    <dbl>       <dbl>          <dbl>         <dbl>    <dbl>
## 1      1        8.02            8.4           7.6   0.0937
## 2      2        8.44            9.3           7.9   0.101 
## 3      3        8.57            9.3           8     0.117 
## 4      4        8.6             9.3           7.9   0.155 
## 5      5        8.49            9.6           8.1   0.133 
## 6      6        8.22            9.3           6.8   0.252 
## 7      7        8.32            9.7           7.5   0.350 
## 8      8        7.67            8.2           6.7   0.164 
## 9      9        7.96            9.7           7.1   0.385
library(xtable)
office_ratings %>%
  group_by(season) %>%
  summarise(mean_rating = mean(imdb_rating),
            highest_rating = max(imdb_rating), 
            lowest_rating = min(imdb_rating),
            variance = var(imdb_rating)) %>%
  xtable() %>%
  print(type = "html")
season mean_rating highest_rating lowest_rating variance
1 1.00 8.02 8.40 7.60 0.09
2 2.00 8.44 9.30 7.90 0.10
3 3.00 8.57 9.30 8.00 0.12
4 4.00 8.60 9.30 7.90 0.16
5 5.00 8.49 9.60 8.10 0.13
6 6.00 8.22 9.30 6.80 0.25
7 7.00 8.32 9.70 7.50 0.35
8 8.00 7.67 8.20 6.70 0.16
9 9.00 7.96 9.70 7.10 0.39
office_ratings %>%
  group_by(season) %>%
  summarise(mean_rating = mean(imdb_rating),
            highest_rating = max(imdb_rating), 
            lowest_rating = min(imdb_rating),
            variance = var(imdb_rating)) %>%
  xtable() %>%
  print(type = "html", include.rownames = F)
season mean_rating highest_rating lowest_rating variance
1.00 8.02 8.40 7.60 0.09
2.00 8.44 9.30 7.90 0.10
3.00 8.57 9.30 8.00 0.12
4.00 8.60 9.30 7.90 0.16
5.00 8.49 9.60 8.10 0.13
6.00 8.22 9.30 6.80 0.25
7.00 8.32 9.70 7.50 0.35
8.00 7.67 8.20 6.70 0.16
9.00 7.96 9.70 7.10 0.39
office_ratings %>%
  mutate(season = as.factor(season)) %>%
  group_by(season) %>%
  summarise(mean_rating = mean(imdb_rating),
            highest_rating = max(imdb_rating), 
            lowest_rating = min(imdb_rating),
            variance = var(imdb_rating)) %>%
  rename(Season = season,
         Mean = mean_rating, 
         Higest = highest_rating,
         Lowest = lowest_rating,
         Variance = variance) %>%
  xtable(caption = "Summary Statistics of Rating by Season") %>%
  print(type = "html", include.rownames = F)
Summary Statistics of Rating by Season
Season Mean Higest Lowest Variance
1 8.02 8.40 7.60 0.09
2 8.44 9.30 7.90 0.10
3 8.57 9.30 8.00 0.12
4 8.60 9.30 7.90 0.16
5 8.49 9.60 8.10 0.13
6 8.22 9.30 6.80 0.25
7 8.32 9.70 7.50 0.35
8 7.67 8.20 6.70 0.16
9 7.96 9.70 7.10 0.39

Total Votes

office_ratings
## # A tibble: 188 x 6
##    season episode title             imdb_rating total_votes air_date  
##     <dbl>   <dbl> <chr>                   <dbl>       <dbl> <date>    
##  1      1       1 Pilot                     7.6        3706 2005-03-24
##  2      1       2 Diversity Day             8.3        3566 2005-03-29
##  3      1       3 Health Care               7.9        2983 2005-04-05
##  4      1       4 The Alliance              8.1        2886 2005-04-12
##  5      1       5 Basketball                8.4        3179 2005-04-19
##  6      1       6 Hot Girl                  7.8        2852 2005-04-26
##  7      2       1 The Dundies               8.7        3213 2005-09-20
##  8      2       2 Sexual Harassment         8.2        2736 2005-09-27
##  9      2       3 Office Olympics           8.4        2742 2005-10-04
## 10      2       4 The Fire                  8.4        2713 2005-10-11
## # … with 178 more rows
office_ratings %>%
  ggplot(aes(x = air_date, y = total_votes)) +
  geom_line()

office_ratings %>%
  ggplot(aes(x = air_date, y = total_votes)) +
  geom_line(colour = "purple") +
  labs(title = "Number of IMDB Votes Over Time",
       x = "Date", y = "Total Number of Votes")

office_ratings %>%
  mutate(season = as.factor(season)) %>%
  group_by(season) %>%
  summarise(mean_rating = mean(total_votes),
            highest_rating = max(total_votes), 
            lowest_rating = min(total_votes),
            variance = var(total_votes)) %>%
  rename(Season = season,
         Mean = mean_rating, 
         Higest = highest_rating,
         Lowest = lowest_rating,
         Variance = variance) %>%
  xtable(caption = "Summary Statistics of Total Votes by Season") %>%
  print(type = "html", include.rownames = F)
Summary Statistics of Total Votes by Season
Season Mean Higest Lowest Variance
1 3195.33 3706.00 2852.00 131418.27
2 2630.64 3644.00 2323.00 118547.67
3 2443.17 3087.00 2254.00 54387.97
4 2422.57 4095.00 1977.00 314309.19
5 2150.73 5948.00 1808.00 631845.08
6 1856.54 3579.00 1571.00 153722.18
7 2030.96 5749.00 1581.00 819589.52
8 1546.38 1829.00 1393.00 10059.38
9 1852.61 7934.00 1394.00 1854068.43
library(knitr)
office_ratings %>%
  mutate(season = as.factor(season)) %>%
  group_by(season) %>%
  summarise(mean_rating = mean(total_votes),
            highest_rating = max(total_votes), 
            lowest_rating = min(total_votes),
            variance = var(total_votes)) %>%
  rename(Season = season,
         Mean = mean_rating, 
         Higest = highest_rating,
         Lowest = lowest_rating,
         Variance = variance) -> votes_summary_by_season

kable(votes_summary_by_season)
Season Mean Higest Lowest Variance
1 3195.333 3706 2852 131418.27
2 2630.636 3644 2323 118547.67
3 2443.174 3087 2254 54387.97
4 2422.571 4095 1977 314309.19
5 2150.731 5948 1808 631845.08
6 1856.538 3579 1571 153722.18
7 2030.958 5749 1581 819589.52
8 1546.375 1829 1393 10059.38
9 1852.609 7934 1394 1854068.43
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
kable(votes_summary_by_season, caption = "Summary Statistics of Total Votes by Season") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Summary Statistics of Total Votes by Season
Season Mean Higest Lowest Variance
1 3195.333 3706 2852 131418.27
2 2630.636 3644 2323 118547.67
3 2443.174 3087 2254 54387.97
4 2422.571 4095 1977 314309.19
5 2150.731 5948 1808 631845.08
6 1856.538 3579 1571 153722.18
7 2030.958 5749 1581 819589.52
8 1546.375 1829 1393 10059.38
9 1852.609 7934 1394 1854068.43
# https://cran.r-project.org/web/packages/kableExtra/vignettes/awesome_table_in_html.html

Votes vs Rating

office_ratings %>%
  ggplot(aes(x = imdb_rating, y = total_votes)) +
  geom_point()

office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = imdb_rating, y = total_votes, colour = season)) +
  geom_point()

office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = imdb_rating, y = total_votes, colour = season)) +
  geom_point() + 
  labs(title = "Votes vs Rating", x = "Rating", y = "Total Number of Votes", colour = "Season")

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = imdb_rating, y = total_votes, colour = season)) +
  geom_point() + 
  labs(title = "Votes vs Rating", 
       x = "Rating", 
       y = "Total Number of Votes", 
       colour = "Season") -> votes_vs_rating
  
  
ggplotly(votes_vs_rating)
votes_vs_rating2 <- 
  office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = imdb_rating, y = total_votes, colour = season, 
             text = paste0(title,
                           "<br>IMDB: ", imdb_rating, 
                           "<br>No. Votes: ", total_votes, 
                           "<br>Season: ", season))) +
  geom_point() + 
  labs(title = "IMDB Votes vs Rating", x = "Rating", y = "Total Number of Votes", colour = "Season")

ggplotly(votes_vs_rating2, tooltip = "text")
votes_vs_rating3 <- 
  office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = imdb_rating, y = total_votes, colour = season, 
             text = paste0("S", season, ".E", episode, " ", title,
                           "<br>IMDB: ", imdb_rating, 
                           "<br>No. Votes: ", total_votes))) +
  geom_point() + 
  labs(title = "IMDB Votes vs Rating", x = "Rating", y = "Total Number of Votes", colour = "Season")

ggplotly(votes_vs_rating3, tooltip = "text")